#!/usr/bin/env python3

"""Downloader of sample audio data

Configuration files are in the directory downloder_conf.

Usage:
    download_speech_corpus.py <config> [-h] [-q] [-f] [-m]

Parameters:
    <config>        The path of configuration file
    -h, --help      Show this help and exit
    -q, --quiet     Don't show any messages about progress
    -f, --force     Overwrite existing corpus files
"""

import os
import re
import shutil
import urllib.parse
import urllib.request
from fnmatch import fnmatchcase
from pathlib import Path
from tempfile import TemporaryDirectory

import yaml
from docopt import docopt


class UserOption:
    """
    Plain container for the flags given on the command line.

    Parameters
    ----------
    verbose : bool
        `True` if progress messages should be printed.
    force : bool
        `True` if existing corpus files may be overwritten.
    """

    def __init__(self, verbose=True, force=False):
        self.verbose = verbose
        self.force = force


class FixedStrPattern:
    """
    Pattern that accepts exactly one fixed string.

    It exposes the same `match` method as the other pattern classes so
    that callers can use any of them interchangeably (duck typing).

    Parameters
    ----------
    expected : str
        the string a target has to be equal to
    """

    def __init__(self, expected):
        self.expected = expected

    def match(self, target):
        """
        Tell whether `target` is equal to the stored string.
        """
        is_same = target == self.expected
        return is_same

    def __repr__(self):
        # %r applies repr() to the stored string.
        return "%s(%r)" % (type(self).__name__, self.expected)


class PartialMatchFixedStrPattern(FixedStrPattern):
    """
    Fixed-string pattern that accepts any target containing the string.

    e.g.::

        PartialMatchFixedStrPattern("44").match("dataset1/44_1k/001.wav")
        => True
    """

    def match(self, target):
        """
        Tell whether the stored string occurs somewhere in `target`.
        """
        is_contained = self.expected in target
        return is_contained


class RegExPattern:
    """
    Pattern backed by a regular expression.

    Parameters
    ----------
    expected : str
        source text of the regular expression; it is compiled once here
    """

    def __init__(self, expected):
        self.expected = re.compile(expected)

    def match(self, target):
        """
        Tell whether the regular expression matches at the start of `target`.
        """
        # `match` anchors at the beginning only; it yields None on failure.
        found = self.expected.match(target)
        return found is not None

    def __repr__(self):
        # Show the pattern source rather than the compiled object.
        return "%s(%r)" % (type(self).__name__, self.expected.pattern)


class GlobPattern(FixedStrPattern):
    """
    Pattern written as a shell-style glob.

    Like: SF* MF*

    Parameters
    ----------
    expected : str
        the glob expression targets are compared with
    """

    def match(self, target):
        """
        Tell whether `target` matches the glob (case-sensitively).
        """
        glob_expression = self.expected
        return fnmatchcase(target, glob_expression)


def generate_pattern_from_obj(pattern_obj, DefaultPattern=FixedStrPattern):
    """
    Generates an appropriate pattern from an object parsed from YAML.

    Parameters
    ----------
    pattern_obj : Union[str, Dict[str, str]]
        Parsed YAML object.
        Only one object indicated by these expressions is allowed.
        e.g.::

            pattern -> fixed string (default; changeable by 2nd argument)
            regex: pattern
            regexp: pattern
            -> regexp
            glob: pattern -> glob
    DefaultPattern
        Pattern class used when `glob` or `regex(p)` is not designated.

    Returns
    -------
    Object
        Generated pattern instance.

    Raises
    ------
    ValueError
        When `pattern_obj` is neither a string nor a dictionary with
        exactly one item, or when the key of the dictionary is not one of
        `regex`, `regexp` and `glob`.
    """
    if isinstance(pattern_obj, str):
        return DefaultPattern(pattern_obj)
    if not (isinstance(pattern_obj, dict) and len(pattern_obj) == 1):
        raise ValueError("Pattern object must be str or dictionary w/ 1 item.")

    # The dictionary holds exactly one `type: pattern` pair.
    pattern_type, pattern = next(iter(pattern_obj.items()))
    if pattern_type in {"regex", "regexp"}:
        return RegExPattern(pattern)
    if pattern_type == "glob":
        return GlobPattern(pattern)
    # Report the offending key (the pattern type), not the pattern body.
    raise ValueError("Unknown pattern type: `{}`.".format(pattern_type))


class PatternList:
    """
    Collection of patterns that can be queried through a single `match`.

    Parameters
    ----------
    patterns : Iterable[Pattern]
        patterns to hold; anything that is not already a list is copied
        into a new list
    """

    def __init__(self, patterns):
        if not isinstance(patterns, list):
            patterns = list(patterns)
        self.patterns = patterns

    @classmethod
    def from_obj(cls, patterns_obj, DefaultPattern=FixedStrPattern):
        """
        Builds an instance from an object parsed from YAML.

        The object has to look like one of these::

            foo

            regex: foo

            - foo
            - glob: bar
            - regexp: baz

        This is how `only` and `except` clauses are expressed.

        Parameters
        ----------
        patterns_obj : Union[str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            object generated by parsing YAML
        DefaultPattern
            Pattern class used when `glob` or `regex(p)` is not designated.
        """
        if isinstance(patterns_obj, (str, dict)):
            # A lone `foo` or `[pattern type]: foo` is a one-element list.
            patterns_obj = [patterns_obj]
        generated = [
            generate_pattern_from_obj(entry, DefaultPattern)
            for entry in patterns_obj
        ]
        return cls(generated)

    def match(self, target):
        """
        Tells whether at least one of the patterns accepts the string.

        Parameters
        ----------
        target : str
            the string to check
        """
        for candidate in self.patterns:
            if candidate.match(target):
                return True
        return False


class ExtensionList:
    """
    Set of file extensions used to look for audio files in directories.

    Parameters
    ----------
    extensions : Union[List[str], str]
        extensions of audio. e.g. `wav`, [`.mp3`, `opus`], or `.m4a`.

    Raises
    ------
    ValueError
        When `extensions` is neither a string nor a list.
    """

    def __init__(self, extensions):
        if isinstance(extensions, str):
            # A single extension is treated as a one-element list.
            extensions = [extensions]
        if not isinstance(extensions, list):
            raise ValueError("extensions list must be an instance of list.")

        # Leading dots are dropped so `.wav` and `wav` mean the same thing.
        self.extensions = [name.lstrip(".") for name in extensions]

    def itemize_in_directory(self, directory, recurse=False):
        """
        Yields the files in the directory that carry one of the extensions.

        Parameters
        ----------
        directory : Path
            The path of the directory where audio files are searched for.
        recurse : bool
            `True` if audio files in subdirectories must be searched.

        Returns
        -------
        Generator[Path, None, None]
            paths of audio files.
        """
        if recurse:
            query_prefix = "**/*."
        else:
            query_prefix = "*."
        for name in self.extensions:
            for found in directory.glob(query_prefix + name):
                yield found


class BaseNameFilter:
    """
    Filter for paths that inspects nothing but their basenames.

    Parameters
    ----------
    only : PatternList
        patterns a basename must match; `None` disables this check.
        This corresponds to `only` clauses.
    excepted : PatternList
        patterns a basename must not match; `None` disables this check.
        This corresponds to `except` clauses.
    """

    def __init__(self, only, excepted):
        self.only = only
        self.excepted = excepted

    def filter(self, path_list):
        """
        Yields the paths that pass both clauses.

        A path passes when its basename matches the `only` clause (if
        there is one) and does not match the `except` clause (if there is
        one).

        Parameters
        ----------
        path_list : Iterable[Path]
            list of paths

        Returns
        -------
        Generator[Path, None, None]
            list of paths
        """
        for path in path_list:
            if self.only is not None and not self.only.match(path.name):
                continue
            if self.excepted is not None and self.excepted.match(path.name):
                continue
            yield path

    @classmethod
    def from_obj(cls, only, excepted):
        """
        Builds an instance from objects generated by parsing YAML.

        Parameters
        ----------
        only : Union[str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `only` clause.
        excepted : Union[str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `except` clause.
        """
        # A missing clause stays `None`, which turns that check off.
        only_patterns = PatternList.from_obj(only) if only is not None else None
        excepted_patterns = (
            PatternList.from_obj(excepted) if excepted is not None else None
        )
        return cls(only_patterns, excepted_patterns)


class PosixRelativePathFilter(BaseNameFilter):
    """
    Filter that inspects POSIX-style paths relative to a given directory
    instead of basenames.
    """

    def filter(self, path_list, root_dir):
        """
        Yields the paths that pass both clauses.

        A path passes when its relative form matches the `only` clause (if
        there is one) and does not match the `except` clause (if there is
        one).

        Parameters
        ----------
        path_list : Iterable[Path]
            list of paths

        root_dir : Path
            directory the paths in `path_list` are made relative to.
            e.g. `"dataset1/001.wav"` is passed
            to `self.only` and `self.excepted` when::

                path_list = ["tmp/root/speaker1/dataset1/001.wav"]
                root_dir = "tmp/root/speaker1"

        Returns
        -------
        Generator[Path, None, None]
            list of paths
        """
        for path in path_list:
            # The relative form is computed only when a clause needs it.
            if self.only is not None:
                if not self.only.match(path.relative_to(root_dir).as_posix()):
                    continue
            if self.excepted is not None:
                if self.excepted.match(path.relative_to(root_dir).as_posix()):
                    continue
            yield path

    @classmethod
    def from_obj(cls, only, excepted):
        """
        Builds an instance from objects generated by parsing YAML.

        Plain strings in the clauses become partial-match patterns.

        Parameters
        ----------
        only : Union[str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `only` clause.
        excepted : Union[str, Dict[str, str],
        Iterable[Union[str, Dict[str, str]]]]
            parsed contents in `except` clause.
        """
        only_patterns = None
        if only is not None:
            only_patterns = PatternList.from_obj(
                only, PartialMatchFixedStrPattern
            )
        excepted_patterns = None
        if excepted is not None:
            excepted_patterns = PatternList.from_obj(
                excepted, PartialMatchFixedStrPattern
            )
        return cls(only_patterns, excepted_patterns)


class GlobalConfiguration:
    """
    Settings taken from the `config` clause of the configuration file.

    Parameters
    ----------
    config : Dict(YAML-parsed objects)
        Contents in `config` clause.

    Raises
    ------
    ValueError
        When `config` is not a dictionary.
    """

    def __init__(self, config):
        if not isinstance(config, dict):
            raise ValueError("The argument must be a dictionary.")

        # Without an `extensions` entry only wave files are collected.
        if "extensions" in config:
            requested = config["extensions"]
        else:
            requested = ["wav"]
        self.extensions = ExtensionList(requested)


class DataArchive:
    """
    Corresponds to each recipe in `files` clause.

    It corresponds to one archive file and audio files in it.

    Parameters
    ----------
    file_config: Dict[str, Any]
        parsed contents of each element in `files` clause.
        The keys `name`, `src` and `root` are required; `only`, `except`
        and `each_dir` are optional.
    global_config : GlobalConfiguration
        settings built from the `config` clause
        (its `extensions` attribute is used to find audio files).
    user_option : UserOption
        User option of this program (verbose etc.).
    """

    def __init__(self, file_config, global_config, user_option):
        self.name = file_config["name"]
        self.src_url = file_config["src"]
        # Leading `/` throws away the path of a directory
        # where a archive file is extracted
        self.audio_root_relative = file_config["root"].lstrip("/")
        self.global_config = global_config
        self.user_option = user_option
        self.file_path_filter = BaseNameFilter.from_obj(
            file_config.get("only"), file_config.get("except")
        )
        if "each_dir" in file_config:
            each_dir = file_config["each_dir"]
            self.each_dir_filter = PosixRelativePathFilter.from_obj(
                each_dir.get("only"), each_dir.get("except")
            )
            # Anything that is not a real boolean counts as "do not recurse".
            recurse = each_dir.get("recurse")
            self.recurse_subdir = (
                recurse if isinstance(recurse, bool) else False
            )
        else:
            self.each_dir_filter = PosixRelativePathFilter(None, None)
            self.recurse_subdir = False

    def download(self, dest_root):
        """
        Downloads archive and extracts audio files.

        Parameters
        ----------
        dest_root : Path
            the root path where directories
            that contains audio files are placed.
        """
        with TemporaryDirectory() as working_dir:
            working_dir = Path(working_dir)  # convert from str

            if re.match(r"^(https?|ftp)://", self.src_url):
                # download archive and extract files in the working directory.
                if self.user_option.verbose:
                    print(
                        "Downloading",
                        self.name,
                        "from",
                        self.src_url,
                        "..."
                    )
                archive_path = DataArchive._download_file(
                    self.src_url, working_dir
                )
            else:
                if self.src_url.startswith("file:///"):
                    # posix: file:///home/user/ -> /home/user
                    # windows: files:///C:/Users/user/ -> C:/Users/user/
                    archive_path = self.src_url.replace(
                        "file://" if os.name == "posix" else "file:///", "", 1
                    )
                else:
                    archive_path = self.src_url
                if self.user_option.verbose:
                    print("Using local file:", archive_path)

            if self.user_option.verbose:
                print("Unpack:", archive_path)
            # NOTE(review): the archive is unpacked without checking member
            # paths; only use sources that are trusted.
            shutil.unpack_archive(str(archive_path), str(working_dir))

            # move audio files to the destination directory.
            self._move_all_audio(
                working_dir / self.audio_root_relative, dest_root
            )

    def _move_all_audio(self, archive_root, dest_root):
        """
        Moves all audio files in archive.

        Parameters
        ----------
        archive_root : Path
            root directory of extracted archive.
        dest_root : Path
            root directory directories that contain audio files are moved to.
        """
        # Only directories directly under the archive root are candidates.
        subdirectories = (
            entry for entry in archive_root.iterdir() if entry.is_dir()
        )
        for directory in self.file_path_filter.filter(subdirectories):
            dest_dir = dest_root / directory.name
            self._move_audio_in_dir(directory, dest_dir)

    def _move_audio_in_dir(self, src, dest):
        """
        Moves audio files in one directory.

        Parameters
        ----------
        src : Path
            the path of the directory audio files are in.
        dest : Path
            the path of the directory audio files are moved to.
        """
        if self.user_option.verbose:
            print("Move:", src.name)
        os.makedirs(str(dest), exist_ok=True)
        for wav_file in self.each_dir_filter.filter(
            self.global_config.extensions.itemize_in_directory(
                src,
                self.recurse_subdir
            ),
            src
        ):
            # All audio files in different directories (when recurse = True)
            #   are stuffed in one directory (dest).
            dest_path = dest / wav_file.name
            self._move_file(wav_file, dest_path)

    def _move_file(self, src, dest):
        """
        Moves one file, honouring the `force` option.

        An existing destination is replaced only when `force` is set;
        otherwise the existing file is kept and `src` is left where it is.

        Parameters
        ----------
        src : Path
            the file to move.
        dest : Path
            the path the file is moved to.
        """
        if dest.exists():
            if not self.user_option.force:
                # Without --force existing corpus files must stay untouched.
                if self.user_option.verbose:
                    print("Skip (already exists):", dest)
                return
            dest.unlink()
        shutil.move(str(src), str(dest))

    @staticmethod
    def _download_file(url, dest=None):
        """Download and store a remote file.

        Parameters
        ----------
        url : str
            The URL of the remote file.
        dest : str or path-like or None
            The path where the downloaded file is stored.
            If an existing directory is designated, the file is stored in
            that directory under the name taken from the final URL.
            If `None`, the file is stored in the current directory under
            that name.

        Returns
        -------
        str or path-like
            The path of the stored file (same type as `dest` when `dest`
            is a directory).

        Raises
        ------
        urllib.error.HTTPError
            When the server answers with an HTTP error status.
        """
        with urllib.request.urlopen(url) as request_obj:
            # geturl() gives the URL actually retrieved by urlopen.
            real_file_name = os.path.basename(
                urllib.parse.urlparse(request_obj.geturl()).path
            )
            if dest is None:
                dest = real_file_name
            elif os.path.isdir(str(dest)):  # wrapping in str is for Python 3.5
                dest = type(dest)(os.path.join(str(dest), real_file_name))
            with open(str(dest), "wb") as file_obj:
                shutil.copyfileobj(request_obj, file_obj)
        return dest


class Downloader:
    """
    Class to execute all the processes to extract audio files.

    Parameters
    ----------
    config_path : str or path-like
        Path of configuration file for downloading corpus
    user_option : UserOption
        User option designated in arguments of this program (e.g. verbose)

    Raises
    ------
    ValueError
        When the top level of the configuration file is not a mapping.
    """
    def __init__(self, config_path, user_option):
        with open(config_path) as f:
            # safe_load builds plain Python objects only; yaml.load without
            # an explicit Loader is unsafe and is rejected by newer PyYAML.
            self.all_configs = yaml.safe_load(f)
        if not isinstance(self.all_configs, dict):
            raise ValueError(
                "The configuration file must contain a mapping at top level."
            )
        self.user_option = user_option
        self.global_config = GlobalConfiguration(
            self.all_configs.get("config", {})
        )
        # One DataArchive per entry of the `files` clause.
        self.files = [
            DataArchive(file_info_dic, self.global_config, user_option)
            for file_info_dic in self.all_configs["files"]
        ]

    def download(self, dest):
        """
        Downloads archives and extracts and places all audio files

        Parameters
        ----------
        dest : Path
            directory where audio files and directories are placed
        """
        for file in self.files:
            file.download(dest)


if __name__ == "__main__":
    cli_args = docopt(__doc__)
    # --quiet turns regular progress messages off, so verbosity is its
    # negation; --force allows overwriting.
    option = UserOption(
        verbose=not cli_args["--quiet"], force=cli_args["--force"]
    )
    config_file = cli_args["<config>"]

    # Audio files are collected in data/wav next to this script.
    destination = Path(__file__).parent / "data" / "wav"

    Downloader(config_file, option).download(destination)
